player_PGstats_2021.csv – NBA players statistics per game in
2021-2022 season
source: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html
player_Adstats_2021.csv – NBA players advance statistics in 2021-2022
season source: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html
salary2022.csv – NBA players contract in 2022-2023 season onward
source: https://www.basketball-reference.com/contracts/players.html
library(knitr)
library(plyr)
library(dplyr)
library(tidyr)
library(caret)
library(ggplot2)
library(corrplot)
library(stringr)
library(scales)
library(randomForest)
library(psych)
library(glmnet)
library(rpart)
library(lubridate)
library(plotly)
library(forcats)
library(ggExtra)
opts_chunk$set(echo = TRUE, cache = TRUE)
opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE, fig.height = 6, fig.width = 9)
pgstats <- read.csv("files/2022/player_PGstats_2021.csv")
adstats <- read.csv("files/2022/player_Adstats_2021.csv")
salary <- read.csv("files/2022/salary2022.csv")
NBA players statistics per game in 2021-2022 season
source: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html
dim(pgstats)
## [1] 812 31
str(pgstats)
## 'data.frame': 812 obs. of 31 variables:
## $ Rk : int 1 2 3 4 5 6 6 6 7 8 ...
## $ Player : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Pos : chr "C" "C" "C" "PF" ...
## $ Age : int 22 28 24 21 36 23 23 23 26 23 ...
## $ Tm : chr "TOR" "MEM" "MIA" "MEM" ...
## $ G : int 73 76 56 32 47 65 50 15 66 56 ...
## $ GS : int 28 75 56 0 12 21 19 2 61 56 ...
## $ MP : num 23.6 26.3 32.6 11.3 22.3 22.6 26.3 9.9 27.3 32.3 ...
## $ FG : num 3.6 2.8 7.3 1.7 5.4 3.9 4.7 1.1 3.9 6.6 ...
## $ FGA : num 8.3 5.1 13 4.1 9.7 10.5 12.6 3.2 8.6 9.7 ...
## $ FG. : num 0.439 0.547 0.557 0.402 0.55 0.372 0.375 0.333 0.448 0.677 ...
## $ X3P : num 0.8 0 0 0.2 0.3 1.6 1.9 0.7 2.4 0 ...
## $ X3PA : num 2.1 0 0.1 1.5 1 5.2 6.1 2.2 5.9 0.2 ...
## $ X3P. : num 0.359 0 0 0.125 0.304 0.311 0.311 0.303 0.409 0.1 ...
## $ X2P : num 2.9 2.8 7.3 1.5 5.1 2.3 2.8 0.4 1.5 6.6 ...
## $ X2PA : num 6.1 5 12.9 2.6 8.8 5.3 6.5 1 2.7 9.6 ...
## $ X2P. : num 0.468 0.548 0.562 0.56 0.578 0.433 0.434 0.4 0.533 0.688 ...
## $ eFG. : num 0.486 0.547 0.557 0.424 0.566 0.449 0.45 0.438 0.588 0.678 ...
## $ FT : num 1.1 1.4 4.6 0.6 1.9 1.2 1.4 0.7 1 2.9 ...
## $ FTA : num 1.8 2.6 6.1 1 2.2 1.7 1.9 0.8 1.1 4.2 ...
## $ FT. : num 0.595 0.543 0.753 0.625 0.873 0.743 0.722 0.917 0.865 0.708 ...
## $ ORB : num 2 4.6 2.4 1 1.6 0.6 0.7 0.1 0.5 3.4 ...
## $ DRB : num 4.5 5.4 7.6 1.7 3.9 2.3 2.6 1.5 2.9 7.3 ...
## $ TRB : num 6.5 10 10.1 2.7 5.5 2.9 3.3 1.5 3.4 10.8 ...
## $ AST : num 1.1 3.4 3.4 0.7 0.9 2.4 2.8 1.1 1.5 1.6 ...
## $ STL : num 0.5 0.9 1.4 0.2 0.3 0.7 0.8 0.3 0.7 0.8 ...
## $ BLK : num 0.6 0.8 0.8 0.3 1 0.4 0.4 0.3 0.3 1.3 ...
## $ TOV : num 1.2 1.5 2.6 0.5 0.9 1.4 1.7 0.5 0.7 1.7 ...
## $ PF : num 2.1 2 3.1 1.1 1.7 1.6 1.8 1 1.5 1.7 ...
## $ PTS : num 9.1 6.9 19.1 4.1 12.9 10.6 12.8 3.5 11.1 16.1 ...
## $ player_id: chr "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
player_Adstats_2021.csv – NBA players advance statistics in 2021-2022 season source: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html
dim(adstats)
## [1] 812 30
str(adstats)
## 'data.frame': 812 obs. of 30 variables:
## $ Rk : int 1 2 3 4 5 6 6 6 7 8 ...
## $ Player : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Pos : chr "C" "C" "C" "PF" ...
## $ Age : int 22 28 24 21 36 23 23 23 26 23 ...
## $ Tm : chr "TOR" "MEM" "MIA" "MEM" ...
## $ G : int 73 76 56 32 47 65 50 15 66 56 ...
## $ MP : int 1725 1999 1825 360 1050 1466 1317 149 1805 1809 ...
## $ PER : num 12.7 17.6 21.8 10.2 19.6 10.5 10.5 10.2 12.7 23 ...
## $ TS. : num 0.503 0.56 0.608 0.452 0.604 0.475 0.474 0.497 0.609 0.698 ...
## $ X3PAr : num 0.259 0.003 0.008 0.364 0.1 0.497 0.483 0.688 0.684 0.018 ...
## $ FTr : num 0.217 0.518 0.466 0.242 0.223 0.16 0.153 0.25 0.13 0.428 ...
## $ ORB. : num 8.7 17.9 8.7 9.4 7.8 2.7 3 0.8 1.9 12 ...
## $ DRB. : num 21.7 22 26.1 16.1 18.7 11.5 11 15.6 10.9 24.5 ...
## $ TRB. : num 14.9 19.9 17.5 12.6 13.4 7.1 6.9 8.5 6.5 18.4 ...
## $ AST. : num 6.9 16.1 17.5 7.7 6.3 16.1 16.1 15.5 7.6 8.2 ...
## $ STL. : num 1.1 1.6 2.2 0.8 0.6 1.5 1.5 1.7 1.2 1.2 ...
## $ BLK. : num 2.3 2.7 2.6 2.5 4 1.5 1.4 2.4 1 3.7 ...
## $ TOV. : num 11.3 19.6 14.4 9.9 8 11.3 11.2 13.1 6.7 12.7 ...
## $ USG. : num 18.5 12 25 18.4 22.4 24.1 24.8 17.9 15.2 18.1 ...
## $ X : logi NA NA NA NA NA NA ...
## $ OWS : num 0.4 3.8 3.6 -0.1 2.1 -1.1 -1.1 0 2.8 5.4 ...
## $ DWS : num 2.1 3 3.5 0.4 1 1.1 0.9 0.2 1.4 3 ...
## $ WS : num 2.5 6.8 7.2 0.3 3.1 0.1 -0.1 0.2 4.2 8.5 ...
## $ WS.48 : num 0.07 0.163 0.188 0.044 0.141 0.003 -0.005 0.07 0.11 0.225 ...
## $ X.1 : logi NA NA NA NA NA NA ...
## $ OBPM : num -2 1 1.7 -4.2 1.3 -1.8 -1.7 -2.9 0.6 2.7 ...
## $ DBPM : num -0.6 1 2.1 -1.5 -0.6 -1.1 -1.3 1.2 -0.2 1.2 ...
## $ BPM : num -2.6 2 3.8 -5.7 0.7 -2.9 -3 -1.7 0.4 3.9 ...
## $ VORP : num -0.2 2 2.7 -0.3 0.7 -0.3 -0.3 0 1.1 2.7 ...
## $ player_id: chr "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
salary2022.csv – NBA players contract in 2022-2023 season
onward
source: https://www.basketball-reference.com/contracts/players.html
dim(salary)
## [1] 448 12
str(salary)
## 'data.frame': 448 obs. of 12 variables:
## $ Rk : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Player : chr "Stephen Curry" "Russell Westbrook" "LeBron James" "Kevin Durant" ...
## $ Tm : chr "GSW" "LAL" "LAL" "BRK" ...
## $ X2022.23 : chr "$48070014" "$47063478" "$44474988" "$44119845" ...
## $ X2023.24 : chr "$51915615" "" "" "$46407433" ...
## $ X2024.25 : chr "$55761216" "" "" "$49856021" ...
## $ X2025.26 : chr "$59606817" "" "" "$53282609" ...
## $ X2026.27 : chr "" "" "" "" ...
## $ X2027.28 : chr "" "" "" "" ...
## $ Signed.Using: chr "Bird" "Bird Rights" "Bird" "Bird" ...
## $ Guaranteed : chr "$215353662" "$47063478" "$44474988" "$193665908" ...
## $ player_id : chr "curryst01" "westbru01" "jamesle01" "duranke01" ...
I will merge the tables by their primary key (pgstats.player_id) and foreign key (salary.player_id) by inner join (only take the entries which exist). I will treat the players that received salary but have not played any game as outliers.
merged <- merge(pgstats, adstats, by = c("player_id", "Tm"))
Since there is players that has changed team in the middle of the season, I will merge the stats together by taking the weighted mean of per game stats and sum of cumulative stats. I will find the variable where both are always the same.
pgName <- names(pgstats)
same <- c()
for (i in 1:length(pgName)) {
if (!pgName[i] %in% names(adstats)) {
same[i] <- FALSE
next
}
if (all(pgstats[, pgName[i]] == adstats[, pgName[i]])) {
same[i] <- TRUE
} else {
same[i] <- FALSE
}
}
pgName[same]
## [1] "Rk" "Player" "Pos" "Age" "Tm" "G"
## [7] "player_id"
For the team variable, I will take the last team it was in, as it should have the largest effect
merged <- merged %>%
group_by(player_id) %>%
summarise(Tm = Tm[length(Tm)], Rk = Rk.x[1], Player = Player.x[1],
Position = Pos.x[1], Age = Age.x[1], Game_played = sum(G.x),
Game_started = sum(GS), MP = sum(MP.y)/sum(G.x), FG = weighted.mean(FG,
G.x), FGA = weighted.mean(FGA, G.x), FGpct = weighted.mean(FG.,
G.x), X3P = weighted.mean(X3P, G.x), X3PA = weighted.mean(X3PA,
G.x), X3Ppct = weighted.mean(X3P., G.x), X2P = weighted.mean(X2P,
G.x), X2PA = weighted.mean(X2PA, G.x), X2Ppct = weighted.mean(X2P.,
G.x), eFGpct = weighted.mean(eFG., G.x), FT = weighted.mean(FT,
G.x), FTA = weighted.mean(FTA, G.x), FTpct = weighted.mean(FT.,
G.x), ORB = weighted.mean(ORB, G.x), DRB = weighted.mean(DRB,
G.x), AST = weighted.mean(AST, G.x), STL = weighted.mean(STL,
G.x), BLK = weighted.mean(BLK, G.x), TOV = weighted.mean(TOV,
G.x), PF = weighted.mean(PF, G.x), PTS = weighted.mean(PTS,
G.x), PER = weighted.mean(PER, MP.x), TSpct = weighted.mean(TS.,
G.x), X3PAr = weighted.mean(X3PAr, G.x), FTr = weighted.mean(FTr,
G.x), ORBpct = weighted.mean(ORB., G.x), DRBpct = weighted.mean(DRB.,
G.x), TRBpct = weighted.mean(TRB., G.x), ASTpct = weighted.mean(AST.,
G.x), STLpct = weighted.mean(STL., G.x), BLKpct = weighted.mean(BLK.,
G.x), TOVpct = weighted.mean(TOV., G.x), USGpct = weighted.mean(USG.,
G.x), OWS = sum(OWS), DWS = sum(DWS), WS = sum(WS),
WSper48 = weighted.mean(WS.48, MP.x), OBPM = weighted.mean(OBPM,
G.x), DBPM = weighted.mean(DBPM, G.x), BPM = weighted.mean(BPM,
G.x), VORP = mean(VORP))
merged <- merge(merged, salary, by = c("player_id"))
write.csv(merged, "dataset/all2022.csv")
all <- read.csv("dataset/all2022.csv")
dim(all)
## [1] 390 62
There are 390 entries.
str(all)
## 'data.frame': 390 obs. of 62 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ player_id : chr "achiupr01" "adamsst01" "adebaba01" "aldamsa01" ...
## $ Tm.x : chr "TOR" "MEM" "MIA" "MEM" ...
## $ Rk.x : int 1 2 3 4 6 7 8 9 11 12 ...
## $ Player.x : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Position : chr "C" "C" "C" "PF" ...
## $ Age : int 22 28 24 21 23 26 23 23 28 27 ...
## $ Game_played : int 73 76 56 32 130 66 56 54 69 67 ...
## $ Game_started: int 28 75 56 0 42 61 56 1 11 67 ...
## $ MP : num 23.6 26.3 32.6 11.2 22.6 ...
## $ FG : num 3.6 2.8 7.3 1.7 3.88 ...
## $ FGA : num 8.3 5.1 13 4.1 10.5 ...
## $ FGpct : num 0.439 0.547 0.557 0.402 0.369 ...
## $ X3P : num 0.8 0 0 0.2 1.61 ...
## $ X3PA : num 2.1 0 0.1 1.5 5.2 5.9 0.2 2 1.6 3.6 ...
## $ X3Ppct : num 0.359 0 0 0.125 0.31 ...
## $ X2P : num 2.9 2.8 7.3 1.5 2.27 ...
## $ X2PA : num 6.1 5 12.9 2.6 5.27 ...
## $ X2Ppct : num 0.468 0.548 0.562 0.56 0.43 ...
## $ eFGpct : num 0.486 0.547 0.557 0.424 0.448 ...
## $ FT : num 1.1 1.4 4.6 0.6 1.22 ...
## $ FTA : num 1.8 2.6 6.1 1 1.67 ...
## $ FTpct : num 0.595 0.543 0.753 0.625 0.755 0.865 0.708 0.679 0.638 0.722 ...
## $ ORB : num 2 4.6 2.4 1 0.581 ...
## $ DRB : num 4.5 5.4 7.6 1.7 2.32 ...
## $ AST : num 1.1 3.4 3.4 0.7 2.4 ...
## $ STL : num 0.5 0.9 1.4 0.2 0.692 ...
## $ BLK : num 0.6 0.8 0.8 0.3 0.388 ...
## $ TOV : num 1.2 1.5 2.6 0.5 1.41 ...
## $ PF : num 2.1 2 3.1 1.1 1.61 ...
## $ PTS : num 9.1 6.9 19.1 4.1 10.6 ...
## $ PER : num 12.7 17.6 21.8 10.2 10.4 ...
## $ TSpct : num 0.503 0.56 0.608 0.452 0.477 ...
## $ X3PAr : num 0.259 0.003 0.008 0.364 0.514 ...
## $ FTr : num 0.217 0.518 0.466 0.242 0.168 ...
## $ ORBpct : num 8.7 17.9 8.7 9.4 2.6 ...
## $ DRBpct : num 21.7 22 26.1 16.1 11.8 ...
## $ TRBpct : num 14.9 19.9 17.5 12.6 7.18 ...
## $ ASTpct : num 6.9 16.1 17.5 7.7 16 ...
## $ STLpct : num 1.1 1.6 2.2 0.8 1.52 ...
## $ BLKpct : num 2.3 2.7 2.6 2.5 1.57 ...
## $ TOVpct : num 11.3 19.6 14.4 9.9 11.5 ...
## $ USGpct : num 18.5 12 25 18.4 23.7 ...
## $ OWS : num 0.4 3.8 3.6 -0.1 -2.2 2.8 5.4 1 1 9.2 ...
## $ DWS : num 2.1 3 3.5 0.4 2.2 1.4 3 1.1 2.5 3.7 ...
## $ WS : num 2.5 6.8 7.2 0.3 0.2 4.2 8.5 2.1 3.5 12.9 ...
## $ WSper48 : num 0.07 0.163 0.188 0.044 0.0107 ...
## $ OBPM : num -2 1 1.7 -4.2 -1.89 ...
## $ DBPM : num -0.6 1 2.1 -1.5 -0.912 ...
## $ BPM : num -2.6 2 3.8 -5.7 -2.8 0.4 3.9 1.9 1.2 11.2 ...
## $ VORP : num -0.2 2 2.7 -0.3 -0.2 1.1 2.7 0.8 1.2 7.4 ...
## $ Rk.y : int 301 76 33 349 226 165 62 418 159 6 ...
## $ Player.y : chr "Precious Achiuwa" "Steven Adams" "Bam Adebayo" "Santi Aldama" ...
## $ Tm.y : chr "TOR" "MEM" "MIA" "MEM" ...
## $ X2022.23 : chr "$2840160" "$17926829" "$30351834" "$2094240" ...
## $ X2023.24 : chr "$4379527" "" "$32600118" "$2194200" ...
## $ X2024.25 : chr "" "" "$34848402" "$3960531" ...
## $ X2025.26 : chr "" "" "$37096686" "" ...
## $ X2026.27 : chr "" "" "" "" ...
## $ X2027.28 : chr "" "" "" "" ...
## $ Signed.Using: chr "1st Round Pick" "1st Round Pick" "Bird" "1st Round Pick" ...
## $ Guaranteed : chr "$2840160" "$17926829" "$134897040" "$2094240" ...
sum(all$Player.x != all$Player.y)
## [1] 0
Since there is no difference in the player name, I will remove player.y and renaming player.x to name
all <- all %>%
select(!Player.y) %>%
rename(name = Player.x)
It is rank in their original respective table (alphabetical order of
player name in player statistics tables and salary in 2022-23 season for
salary table).
Since, it doesn’t carry any extra information, I will remove both of the
variables.
all <- all %>%
select(!c(Rk.x, Rk.y))
Tm.x is the team the player in in 2021-22 season while Tm.y is the team of 2022-23 season. I will change Tm.x to team_2021 and Tm.y to team_2022.
all <- all %>%
rename(team_2021 = Tm.x, team_2022 = Tm.y)
The aim of this project is to predict the salary next year. I will remove the salary of 2023-24 season onward and change the X2022.23 to numeric variables.
all <- all %>%
select(!c(X2023.24, X2024.25, X2025.26, X2026.27, X2027.28)) %>%
rename(salary = X2022.23) %>%
mutate(salary = as.numeric(str_extract(salary, "[0-9]+")))
plot_ly(data = all, x = ~salary, type = "histogram", nbinsx = 30) %>%
layout(title = "Frequency Diagram of NBA salary in 2022-23 season",
xaxis = list(title = "yearly salary (USD)"), yaxis = list(title = "frequency"))
## Warning: Ignoring 1 observations
The salary is highly right skewed and there is high frequency concentrated on 0 to 2 million range. This might be because of the existence of minimum salary in NBA, which is 1 million to 3 million per year depending on their experience (Adams, 2022). I will keep that in mind.
summary(all$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 333333 2201400 5954454 10155507 14004703 48070014 1
I will first use the correlation with salary to get a feel on the numeric variables on the response variables
numVar <- which(sapply(all, is.numeric))
numVarNames <- names(numVar)
length(numVarNames)
## [1] 47
There are 47 numeric variables
all_numVar <- all[, numVar]
all_numVar <- select(all_numVar, !X)
cor_Mat <- cor(all_numVar, use = "pairwise.complete.obs")
cor_names <- names(sort(cor_Mat[, "salary"], decreasing = TRUE))[1:20]
cor_Mat <- cor_Mat[cor_names, cor_names]
corrplot.mixed(cor_Mat, tl.pos = "lt")
Pts:
Points per game
It has the highest correlation with salary among the numeric variables (0.7908116). It is the average point per game played.
ggplotly(ggplot(all %>%
drop_na(PTS, salary), aes(x = PTS, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + labs(title = "points per game in NBA 2021-22 vs salary in NBA 2022-23",
x = "points per game", y = "yearly salary (USD)"))
There is a clear linear correlation between salary and points per game. The correlation is smaller when the points per game is below about 9 but increase after it goes above 9 points.
ggplotly(ggplot(all %>%
drop_na(PTS, salary), aes(x = PTS)) + geom_histogram(bins = 30) +
labs(title = "Frequency Distribution of points per game in NBA 2021-22",
x = "points per game", y = "Frequency"))
Value Over Replacement Player:
VORP - Value Over Replacement Player (available since the 1973-74 season in the NBA); a box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and prorated to an 82-game season.
Although FG, FGA, FT etc are more highly correlated, they are also highly correlated to points per game (> 0.75). I will look at the next one that is not highly correlated to points per game. It has a correlation of (0.6619577) with salary.
ggplotly(ggplot(all %>%
drop_na(VORP, salary), aes(x = VORP, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~
x, method = "glm", linetype = "dotted", col = "red", se = FALSE) +
labs(title = "Value over replacement player in NBA 2021-22 vs salary in NBA 2022-23",
x = "VORP", y = "yearly salary (USD)"))
It shows clear linear correlation except some in both extreme of the VORP.
Assists:
Assists per game
It has a high correlation with salary while not a having such a high correlation with points per game. It has a correlation of (0.6114318) with salary.
ggplotly(ggplot(all %>%
drop_na(AST, salary), aes(x = AST, y = salary)) + geom_point(col = "blue") +
geom_smooth(formula = y ~ x, method = "loess") + geom_smooth(formula = y ~
x, method = "glm", linetype = "dotted", col = "red", se = FALSE) +
labs(title = "Assists per game in NBA 2021-22 vs salary in NBA 2022-23",
x = "Assists per game", y = "yearly salary (USD)"))
It show positive correlation until it goes above 6 assists per game where it shows negative correlation. This maybe explained by that the players with high assist are usually not the first attacking choice of the team which might explain by they are pay less.
Nacol <- names(which(colSums(is.na(all) | all == "") > 0))
sort(colSums(sapply(all[Nacol], function(x) is.na(x) | x == "")),
decreasing = TRUE)
## Signed.Using Guaranteed X3Ppct FTpct X2Ppct FGpct
## 51 37 18 9 6 2
## eFGpct TSpct X3PAr FTr TOVpct salary
## 2 2 2 2 2 1
kable(all[is.na(all$salary), c("X", "name", "salary")])
| X | name | salary | |
|---|---|---|---|
| 366 | 366 | Ish Wainright | NA |
The salary of Ish Wainright is 125000 USD spotrac (n.d.).
all$salary[all$X == 366] <- 125000
Signed.Using:
The type of contract use to sign
I will impute by changing all NA to “None”.
unique(all$Signed.Using)
## [1] "1st Round Pick" "Bird" "MLE"
## [4] "Minimum Salary" "" "Sign and Trade"
## [7] "Bird Rights" "Early Bird" "Cap Space"
## [10] "1st round pick" "Mini MLE" "1st Round pick"
## [13] "Bi-Annual Exception" "Non Bird" "Cap space"
## [16] "Room Exception"
all$Signed.Using[grep("^1st [Rr]ound [Pp]ick", all$Signed.Using)] <- "1st round pick"
all$Signed.Using[grep("Cap [Ss]pace", all$Signed.Using)] <- "Cap space"
all$Signed.Using[is.na(all$Signed.Using) | all$Signed.Using ==
""] <- "None"
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(Signed.Using),
salary, .fun = "mean"), y = salary, fill = Signed.Using)) +
geom_boxplot() + geom_point(stat = "summary", fun = "mean") +
labs(title = "Type of contract vs salary", x = "Type of contract",
y = "yearly salary (USD)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)))
Guaranteed:
The amount of a player's remaining salary that is guarenteed.
Since it is a direct indication of the salary, I will remove this variable.
all <- select(all, !Guaranteed)
X3Ppct:
3 point field goal percentage
kable(all[which(is.na(all$X3Ppct)), c("X3P", "X3PA", "X3Ppct")])
| X3P | X3PA | X3Ppct | |
|---|---|---|---|
| 16 | 0.0000000 | 0.0000000 | NA |
| 39 | 0.0000000 | 0.0000000 | NA |
| 49 | 0.0000000 | 0.0000000 | NA |
| 58 | 0.0000000 | 0.0000000 | NA |
| 72 | 0.0000000 | 0.0000000 | NA |
| 81 | 0.0000000 | 0.0000000 | NA |
| 121 | 0.2942308 | 1.0807692 | NA |
| 195 | 0.0000000 | 0.0000000 | NA |
| 196 | 0.0000000 | 0.0000000 | NA |
| 204 | 0.0000000 | 0.3033333 | NA |
| 257 | 0.5000000 | 1.0000000 | NA |
| 277 | 0.0000000 | 0.0000000 | NA |
| 310 | 0.0000000 | 0.0000000 | NA |
| 313 | 0.0000000 | 0.0000000 | NA |
| 327 | 0.0000000 | 0.0000000 | NA |
| 331 | 0.0000000 | 0.0000000 | NA |
| 341 | 0.0000000 | 0.0000000 | NA |
| 390 | 0.0000000 | 0.0000000 | NA |
I will impute by setting to 0 if there is no 3 point attempt.
all$X3Ppct[which(is.na(all$X3Ppct))] <- sapply(which(is.na(all$X3Ppct)),
function(x) ifelse(all$X3PA[x] == 0, 0, all$X3P[x]/all$X3PA[x]))
ggplotly(ggplot(all, aes(x = X3Ppct, y = salary)) + geom_point() +
geom_smooth(col = "red", formula = y ~ x, method = "glm") +
labs(title = "3 point percentage vs salary", x = "3 point field goal percentage",
y = "yearly salary (USD)"))
It shows a slight but not significant correlation between salary and 3 point percentage.
ggplotly(ggplot(all, aes(x = X3Ppct, y = salary, col = Position)) +
geom_point() + geom_smooth(formula = y ~ x, method = "glm") +
facet_grid(Position ~ .) + labs(title = "3 point percentage vs salary",
x = "3 point field goal percentage", y = "yearly salary (USD)"))
FTpct:
Free throw percentage
kable(all[which(is.na(all$FTpct)), c("FT", "FTA", "FTpct")])
| FT | FTA | FTpct | |
|---|---|---|---|
| 115 | 0.0000000 | 0.0000000 | NA |
| 121 | 0.9730769 | 1.6115385 | NA |
| 154 | 0.0000000 | 0.0000000 | NA |
| 169 | 1.0369048 | 1.4047619 | NA |
| 204 | 0.2966667 | 0.4200000 | NA |
| 222 | 0.2166667 | 0.2166667 | NA |
| 257 | 0.0000000 | 0.0000000 | NA |
| 331 | 0.0000000 | 0.0000000 | NA |
| 341 | 0.0000000 | 0.0000000 | NA |
I will impute by setting to 0 if there is no free throw attempt.
all$FTpct[which(is.na(all$FTpct))] <- sapply(which(is.na(all$FTpct)),
function(x) ifelse(all$FTA[x] == 0, 0, all$FT[x]/all$FTA[x]))
ggplotly(ggplot(all, aes(x = FTpct, y = salary)) + geom_point() +
geom_smooth(formula = y ~ x, method = "glm") + labs(title = "Free throw percentage vs salary",
x = "free throw percentage", y = "yearly salary (USD)"))
2-Point Field Goal Percentage
kable(all[is.na(all$X2Ppct), c("name", "X2P", "X2PA", "X2Ppct")])
| name | X2P | X2PA | X2Ppct | |
|---|---|---|---|---|
| 121 | Wenyen Gabriel | 1.7980769 | 3.175000 | NA |
| 169 | Danuel House Jr. | 0.8595238 | 1.902381 | NA |
| 222 | Didi Louzada | 0.3055556 | 1.094444 | NA |
| 257 | Juwan Morgan | 0.5000000 | 0.500000 | NA |
| 332 | Nik Stauskas | 0.1125000 | 0.500000 | NA |
| 355 | Rayjon Tucker | 0.4000000 | 0.800000 | NA |
all$X2Ppct[is.na(all$X2Ppct)] <- all$X2P[is.na(all$X2Ppct)]/all$X2PA[is.na(all$X2Ppct)]
Field Goal Percentage
kable(all[is.na(all$FGpct), c("FG", "FGA", "FGpct")])
| FG | FGA | FGpct | |
|---|---|---|---|
| 121 | 2.092308 | 4.217308 | NA |
| 257 | 1.000000 | 1.500000 | NA |
all$FGpct[is.na(all$FGpct)] <- all$FG[is.na(all$FGpct)]/all$FGA[is.na(all$FGpct)]
Effective Field Goal Percentage:
This statistics adjusts for the fact that 3-point field goal is worth one more point than 2-point field goal.
The formula for the effective field goal percentage is (Basketball_Reference, n.d.):
\[
\frac{\text{2-Point Field Goals} + 1.5 \cdot \text{3-Point Field
Goals}}{\text{Total Field Goal Attempts}}
\]
kable(all[which(is.na(all$eFGpct)), c("X2P", "X3P", "FGA", "eFGpct")])
| X2P | X3P | FGA | eFGpct | |
|---|---|---|---|---|
| 121 | 1.798077 | 0.2942308 | 4.217308 | NA |
| 257 | 0.500000 | 0.5000000 | 1.500000 | NA |
all$eFGpct[is.na(all$eFGpct)] <- (all$X2P[is.na(all$eFGpct)] +
1.5 * all$X3P[is.na(all$eFGpct)])/all$FGA[is.na(all$eFGpct)]
ggplotly(ggplot(all, aes(x = eFGpct, y = salary)) + geom_point() +
geom_smooth(formula = y ~ x, method = "glm") + labs(title = "Effect field goal percentage vs salary",
x = "effective field goal percentage", y = "yearly salary (USD)"))
True Shooting percentage:
True shooting percentage is a measure of shooting efficiency that takes into account field goals, 3-point field goals, and free throws.
Formula (Basketball_Reference, n.d.): \[ \frac{\text{Points}}{2\cdot(\text{Field Goal Attempts}+0.44\cdot\text{Free Throw Attempts})} \]
kable(all[is.na(all$TSpct), c("PTS", "FGA", "FTA")])
| PTS | FGA | FTA | |
|---|---|---|---|
| 121 | 5.413462 | 4.217308 | 1.611538 |
| 257 | 2.500000 | 1.500000 | 0.000000 |
all$TSpct[is.na(all$TSpct)] <- all$PTS[is.na(all$TSpct)]/(2 *
all$FGA[is.na(all$TSpct)] + 2 * 0.44 * all$FTA[is.na(all$TSpct)])
ggplotly(ggplot(all, aes(x = TSpct, y = salary)) + geom_point() +
geom_smooth(formula = y ~ x, method = "glm") + labs(title = "True shooting percentage vs salary",
x = "True shooting percentage", y = "yearly salary (USD)"))
3-Point Attempt rate
Percentage of field goal attempt from 3-point range
kable(all[is.na(all$X3PAr), c("X3PA", "FGA", "X3PAr")])
| X3PA | FGA | X3PAr | |
|---|---|---|---|
| 121 | 1.080769 | 4.217308 | NA |
| 257 | 1.000000 | 1.500000 | NA |
all$X3PAr[is.na(all$X3PAr)] <- all$X3PA[is.na(all$X3PAr)]/all$FGA[is.na(all$X3PAr)]
Free Throw Attempt Rate:
Number of free throw attempts per field goal attempt
kable(all[is.na(all$FTr), c("FTA", "FGA", "FTr")])
| FTA | FGA | FTr | |
|---|---|---|---|
| 121 | 1.611538 | 4.217308 | NA |
| 257 | 0.000000 | 1.500000 | NA |
all$FTr[is.na(all$FTr)] <- all$FTA[is.na(all$FTr)]/all$FGA[is.na(all$FTr)]
Turnover percentage:
An estimate of turnovers committed per 100 plays
kable(all[is.na(all$TOVpct), c("X", "name", "TOV", "TOVpct")])
| X | name | TOV | TOVpct | |
|---|---|---|---|---|
| 121 | 121 | Wenyen Gabriel | 0.6134615 | NA |
| 257 | 257 | Juwan Morgan | 0.0000000 | NA |
I will manually impute the data by finding the weighted mean of the turnover percentage by minute played from the original data set (before merging)
all$TOVpct[c(121, 257)] <- c(11.1, 0)
Find all character variables:
chrVar <- names(which(sapply(all, is.character)))
chrVar
## [1] "player_id" "team_2021" "name" "team_2022" "Signed.Using"
I will keep player id and name for now to keep track of each entries but will remove it before fitting the model
player_name <- all$name
all <- select(all, !c(name, player_id))
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2021),
salary, median, .desc = TRUE), y = salary, fill = reorder(as.factor(team_2021),
salary, .fun = "mean", decreasing = TRUE))) + geom_boxplot() +
labs(title = "Salary for each team", x = "team in 2021-22",
y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + guides(fill = guide_legend(title = "Team")))
ggplotly(ggplot(all, aes(x = fct_reorder(as.factor(team_2022),
salary, median, .desc = TRUE), y = salary, fill = reorder(as.factor(team_2022),
salary, .fun = "mean", decreasing = TRUE))) + geom_boxplot() +
labs(title = "Salary for each team", x = "team in 2022-23",
y = "yearly salary in 2022-23 (USD)") + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + guides(fill = guide_legend(title = "Team")))
cor(as.numeric(fct_reorder(as.factor(all$team_2021), all$salary,
median)), all$salary)
## [1] 0.2314913
cor(as.numeric(fct_reorder(as.factor(all$team_2022), all$salary,
median)), all$salary)
## [1] 0.1691273
There is no clear correlation between salary and team as each team will have varying salary for their star players and bench players. I will remove this variable.
all <- select(all, !c(team_2021, team_2022))
I will also remove this variable as this might be a direct indication to the salary of the player.
all <- select(all, !Signed.Using)
Although I have already done some visualisation, I will visualize it again.
summary(all$salary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 125000 2200350 5921176 10129787 13980271 48070014
ggplotly(ggplot(data = all, aes(x = salary)) + geom_histogram(bins = 50) +
labs(title = "Frequency distribution of salary", x = "yearly salary (USD)",
y = "frequency") + theme_minimal())
ggplotly(ggplot(data = all, aes(x = "", y = salary)) + geom_boxplot() +
labs(y = "salary") + coord_flip() + theme_minimal())
g1 <- ggplot(all, aes(x = Age, y = salary)) + geom_point(alpha = 0.7) +
theme_minimal()
ggMarginal(g1, type = "boxplot")
inTrain <- sample(0:1, nrow(all), replace = TRUE, prob = c(0.8,
0.2))
train <- all[inTrain == 0, ]
test <- all[inTrain == 1, ]
mod_rf <- randomForest(salary ~ ., train %>%
drop_na(), ntree = 500, importance = T)
imp_rf <- importance(mod_rf)
imp_df <- data.frame(Variables = row.names(imp_rf), MSE = imp_rf[,
1])
imp_df <- imp_df[order(imp_df$MSE, decreasing = TRUE), ]
ggplot(imp_df, aes(x = reorder(Variables, MSE), y = MSE, fill = MSE)) +
geom_bar(stat = "identity") + labs(x = "Variables", y = "% increase MSE if variable is randomly permuted") +
coord_flip() + theme(legend.position = "none")
pred <- predict(mod_rf, newdata = test)
mod <- lm(salary ~ ., data = train)